In [98]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import cross_val_score
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
import gc
In [99]:
#--------------------------------------------------------------------------------------
# Visualization of measurement of results
def plot_metrics_from_wide_df(gram_name,results_df):
"""
Plots all metrics from a wide-format DataFrame.
Args:
results_df (DataFrame): Wide-format DataFrame with columns:
- 'model': Model name.
- 'sampling': Sampling method.
- Metric columns (e.g., 'acc', 'prec', 'recall').
"""
# Metrik sütunlarını seç (model ve sampling haricindeki tüm sütunlar metrik olarak kabul edilir)
metrics = [col for col in results_df.columns if col not in ['model', 'sampling']]
for metric in metrics:
fig, ax = plt.subplots(figsize=(12, 8))
# Pivot tablosu oluÅŸtur
pivoted_data = results_df.pivot(index='sampling', columns='model', values=metric)
# Çubuk grafik çiz
pivoted_data.plot(kind='bar', ax=ax, alpha=0.8, edgecolor='black')
plt.title(f'{gram_name} {metric.replace("_", " ").capitalize()} by Model and Sampling')
plt.ylabel(metric.replace("_", " ").capitalize())
plt.xlabel('Sampling Method')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Model')
plt.tight_layout()
plt.show()
In [100]:
def plot_class_distribution(y,title):
unique_name_count = y.value_counts()
print(unique_name_count)
etiket_mapping = {0:'hiçbiri', 1:'nefret', 2:'saldırgan'}
# Index'i sayısal değerlere eşleme
unique_name_count.index = unique_name_count.index.map(etiket_mapping)
# Çubuk grafik çizimi
plt.bar(unique_name_count.index.tolist(), unique_name_count, color='skyblue', edgecolor='black')
plt.xticks(rotation=90)
plt.title(f"{title} dataset")
plt.xlabel("unique")
plt.ylabel("number")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
In [101]:
def evaluate_model_with_metrics(gram_name,model_name,sampling,model, X_train, y_train, X_test, y_test):
# Train the model
model.fit(X_train, y_train)
# Cross-validation scores for training accuracy
train_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
training_accuracy = np.mean(train_scores)
# Validation accuracy on test set
y_pred = model.predict(X_test)
y_pred = y_pred.ravel() # or y_pred.flatten()
validation_accuracy = accuracy_score(y_test, y_pred)
# Additional metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='weighted')
recall = recall_score(y_test, y_pred,average='weighted')
f1 = f1_score(y_test, y_pred,average='weighted')
print(f"\n{model_name},{sampling},{gram_name}:")
print("accuracy:",accuracy)
print("precision:",precision)
print("recall:",recall)
print("f1:",f1)
print("Training Accuracy:", training_accuracy)
print("Validation Accuracy:", validation_accuracy)
return accuracy,precision, recall, f1, training_accuracy, validation_accuracy
In [102]:
#-------------------------------------------------------------------------
def feature_unionAndmodel_training(gram_name,feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels):
X_train_features = feature_union.fit_transform(X_train)
X_test_features = feature_union.transform(X_test)
plot_class_distribution(y_train, f'{gram_name} Original Training Data Class Distribution')
print("Original Training Data Class Distributionı:", Counter(y_train))
under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train_features, y_train)
plot_class_distribution(y_train_under,f'{gram_name} After Under-Sampling,Training Data Class Distribution')
print("After Under-Sampling,Training Data Class Distribution:", Counter(y_train_under))
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(X_train_features, y_train)
print("After Over-Sampling, Training Data Class Distribution:", Counter(y_train_over))
plot_class_distribution(y_train_over,f'{gram_name} After Over-Sampling, Training Data Class Distribution')
smote = SMOTE(sampling_strategy='auto',random_state=42, k_neighbors=2)
X_train_smote, y_train_smote = smote.fit_resample(X_train_features, y_train,)
print("After SMOTE ,Training Data Class Distributionı:", Counter(y_train_smote))
plot_class_distribution(y_train_smote, f'{gram_name} After SMOTE ,Training Data Class Distributionı')
# Enhanced results dictionary
results = {
'model': [],
'sampling': [],
'accuracy': [],
'precision': [],
'recall': [],
'f1_score': [],
'training_accuracy': [],
'validation_accuracy': []
}
#ML ALG.
# Iterate through models and sampling methods
for X_train_res, y_train_res, sampling in [
(X_train_under, y_train_under, 'Under-Sampling'),
(X_train_over, y_train_over, 'Over-Sampling'),
(X_train_smote, y_train_smote, 'SMOTE')
]:
for model_name, model in modelsAndNames:
print(f"\n{gram_name}:")
accuracy,precision, recall, f1, training_accuracy, validation_accuracy = evaluate_model_with_metrics(
gram_name,model_name,sampling,
model,
X_train_res, y_train_res,
X_test_features, y_test
)
results['model'].append(model_name)
results['sampling'].append(sampling)
results['accuracy'].append(accuracy)
results['precision'].append(precision)
results['recall'].append(recall)
results['f1_score'].append(f1)
results['training_accuracy'].append(training_accuracy)
results['validation_accuracy'].append(validation_accuracy)
# Convert results to DataFrame
results_df = pd.DataFrame(results)
# Display results
print(results_df)
# Call the plotting function
plot_metrics_from_wide_df(gram_name,results_df)
results_df.to_csv(f'{gram_name}.csv', index=False)
return results_df
In [103]:
import warnings
warnings.filterwarnings('ignore')
import sklearn
print(sklearn.__version__)
import numpy as np
print(np.__version__)
#import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from collections import Counter
import xgboost as xgb
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
import csv
from catboost import CatBoostClassifier
try:
data = pd.read_excel("temizlenmis_ve_islenmis_veriler.xlsx", engine='openpyxl')
except Exception as e:
print(f"Dönüştürme sırasında bir hata oluştu: {e}")
#data = pd.read_csv("data_Llama.csv")
data = data.fillna("hiçbiri")
#data.drop(columns=['row_id'], inplace=True)# row index aynı indexte olan değerler,aynı değilleri.
data = data[["tweet","etiket"]]
print(data['etiket'].value_counts())
tweets = data['tweet']
labels = data["etiket"]
#--------------------------------------------------------------------------------------
print("Orijinal veri sınıf dağılımı:", Counter(labels))
1.4.2
1.26.4
etiket
hiçbiri 7722
nefret 2336
saldırgan 166
Name: count, dtype: int64
Orijinal veri sınıf dağılımı: Counter({'hiçbiri': 7722, 'nefret': 2336, 'saldırgan': 166})
In [104]:
# Modeller--------------------------------------------------------------------------------------
xgb_model = xgb.XGBClassifier(tree_method='gpu_hist',max_depth=6, gpu_id=0, random_state=42, use_label_encoder=False, n_estimators=50)
catB_model = CatBoostClassifier(iterations=50, learning_rate=0.1, depth=4, verbose=0)# (XGBoost ve CatBoost)
ann_mlpc_sgd = MLPClassifier(solver='sgd', alpha=1e-5, activation='relu', hidden_layer_sizes=(20, 10, 5),max_iter=50, learning_rate='adaptive', random_state=7)# MLPClassifier (Yapay Sinir Ağı Sınıflandırıcı)
extraTC = ExtraTreesClassifier(n_estimators=100, random_state=7)# ExtraTreesClassifier (Ekstra Karar Ağaçları)
modelsAndNames = [
('XGBoost', xgb_model),
('CatBoostC', catB_model),
('MLPC-sgd', ann_mlpc_sgd),
('ExtraTreesClassifier', extraTC)
]
#--------------------------------------------------------------------------------------
In [105]:
# Kategorik etiketleri sayısal verilere dönüştürme
label_mapping = {'nefret': 1, 'hiçbiri': 0, 'saldırgan': 2}
labels = data['etiket'].map(label_mapping)
#
result_dic ={}
len_labels = len(set(labels)) #for num_classes
# Veriyi eğitim ve test setine ayırma
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state=42)
In [106]:
#word unigram and bigram ,char bigram and trigram------------------------------
word_unigram_count = CountVectorizer(ngram_range=(1, 1), analyzer='word') # Kelime bazlı unigram
word_unigram_tfidf = TfidfVectorizer(ngram_range=(1, 1), analyzer='word') # Kelime bazlı unigram TF-IDF
word_bigram_count = CountVectorizer(ngram_range=(2, 2), analyzer='word') # Kelime bazlı bigram
word_bigram_tfidf = TfidfVectorizer(ngram_range=(2, 2), analyzer='word') # Kelime bazlı bigram TF-IDF
# Karakter bazlı bigram tanımları
char_bigram_count = CountVectorizer(ngram_range=(2, 2), analyzer='char') # Karakter bazlı bigram
char_bigram_tfidf = TfidfVectorizer(ngram_range=(2, 2), analyzer='char') # Karakter bazlı bigram TF-IDF
# Karakter bazlı trigram tanımları
char_trigram_count = CountVectorizer(ngram_range=(3, 3), analyzer='char') # Karakter bazlı trigram
char_trigram_tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer='char') # Karakter bazlı trigram TF-IDF
word unigram-------------------------------------------¶
In [108]:
feature_union = FeatureUnion([
("word_unigram_count", word_unigram_count),
("word_unigram_tfidf", word_unigram_tfidf)
])
In [109]:
results_df = feature_unionAndmodel_training("Word_Unigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["Word_Unigram"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
Word_Unigram:
XGBoost,Under-Sampling,Word_Unigram:
accuracy: 0.5095354523227383
precision: 0.6575901117758006
recall: 0.5095354523227383
f1: 0.5646169098292118
Training Accuracy: 0.3695652173913044
Validation Accuracy: 0.5095354523227383
Word_Unigram:
CatBoostC,Under-Sampling,Word_Unigram:
accuracy: 0.6440097799511002
precision: 0.7080178858853468
recall: 0.6440097799511002
f1: 0.6674804861339697
Training Accuracy: 0.47826086956521735
Validation Accuracy: 0.6440097799511002
Word_Unigram:
MLPC-sgd,Under-Sampling,Word_Unigram:
accuracy: 0.7466992665036675
precision: 0.557559794597115
recall: 0.7466992665036675
f1: 0.6384153303197649
Training Accuracy: 0.3333333333333333
Validation Accuracy: 0.7466992665036675
Word_Unigram:
ExtraTreesClassifier,Under-Sampling,Word_Unigram:
accuracy: 0.6332518337408313
precision: 0.7261700861211683
recall: 0.6332518337408313
f1: 0.6652686178056041
Training Accuracy: 0.47826086956521735
Validation Accuracy: 0.6332518337408313
Word_Unigram:
XGBoost,Over-Sampling,Word_Unigram:
accuracy: 0.6381418092909535
precision: 0.7184951030066614
recall: 0.6381418092909535
f1: 0.6588627681405055
Training Accuracy: 0.4382566585956416
Validation Accuracy: 0.6381418092909535
Word_Unigram:
CatBoostC,Over-Sampling,Word_Unigram:
accuracy: 0.6581907090464547
precision: 0.727098286516982
recall: 0.6581907090464547
f1: 0.6823021112184453
Training Accuracy: 0.6886736615550175
Validation Accuracy: 0.6581907090464547
Word_Unigram:
MLPC-sgd,Over-Sampling,Word_Unigram:
accuracy: 0.8019559902200489
precision: 0.7874559613439057
recall: 0.8019559902200489
f1: 0.7918328320509477
Training Accuracy: 0.9390368576809255
Validation Accuracy: 0.8019559902200489
Word_Unigram:
ExtraTreesClassifier,Over-Sampling,Word_Unigram:
accuracy: 0.8273838630806846
precision: 0.8211581691741373
recall: 0.8273838630806846
f1: 0.7955818168049512
Training Accuracy: 0.9839117567931127
Validation Accuracy: 0.8273838630806846
Word_Unigram:
XGBoost,SMOTE,Word_Unigram:
accuracy: 0.6205378973105135
precision: 0.6508394808079272
recall: 0.6205378973105135
f1: 0.6090517132553599
Training Accuracy: 0.455044390637611
Validation Accuracy: 0.6205378973105135
Word_Unigram:
CatBoostC,SMOTE,Word_Unigram:
accuracy: 0.7403422982885085
precision: 0.7170896335321763
recall: 0.7403422982885085
f1: 0.7249969379249598
Training Accuracy: 0.8239978477266613
Validation Accuracy: 0.7403422982885085
Word_Unigram:
MLPC-sgd,SMOTE,Word_Unigram:
accuracy: 0.8058679706601467
precision: 0.791429485059973
recall: 0.8058679706601467
f1: 0.7953933164260065
Training Accuracy: 0.933871401668012
Validation Accuracy: 0.8058679706601467
Word_Unigram:
ExtraTreesClassifier,SMOTE,Word_Unigram:
accuracy: 0.8273838630806846
precision: 0.8228334102571414
recall: 0.8273838630806846
f1: 0.7993982645123894
Training Accuracy: 0.9652407855797686
Validation Accuracy: 0.8273838630806846
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.509535 0.657590 0.509535
1 CatBoostC Under-Sampling 0.644010 0.708018 0.644010
2 MLPC-sgd Under-Sampling 0.746699 0.557560 0.746699
3 ExtraTreesClassifier Under-Sampling 0.633252 0.726170 0.633252
4 XGBoost Over-Sampling 0.638142 0.718495 0.638142
5 CatBoostC Over-Sampling 0.658191 0.727098 0.658191
6 MLPC-sgd Over-Sampling 0.801956 0.787456 0.801956
7 ExtraTreesClassifier Over-Sampling 0.827384 0.821158 0.827384
8 XGBoost SMOTE 0.620538 0.650839 0.620538
9 CatBoostC SMOTE 0.740342 0.717090 0.740342
10 MLPC-sgd SMOTE 0.805868 0.791429 0.805868
11 ExtraTreesClassifier SMOTE 0.827384 0.822833 0.827384
f1_score training_accuracy validation_accuracy
0 0.564617 0.369565 0.509535
1 0.667480 0.478261 0.644010
2 0.638415 0.333333 0.746699
3 0.665269 0.478261 0.633252
4 0.658863 0.438257 0.638142
5 0.682302 0.688674 0.658191
6 0.791833 0.939037 0.801956
7 0.795582 0.983912 0.827384
8 0.609052 0.455044 0.620538
9 0.724997 0.823998 0.740342
10 0.795393 0.933871 0.805868
11 0.799398 0.965241 0.827384
Word bigram-------------------------------------------¶
In [111]:
feature_union = FeatureUnion([
("word_bigram_count", word_bigram_count),
("word_bigram_tfidf", word_bigram_tfidf)
])
In [112]:
feature_unionAndmodel_training("Word_bigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["Word_bigram"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
Word_bigram:
XGBoost,Under-Sampling,Word_bigram:
accuracy: 0.7447432762836186
precision: 0.5574622465122969
recall: 0.7447432762836186
f1: 0.6376355384833673
Training Accuracy: 0.36231884057971014
Validation Accuracy: 0.7447432762836186
Word_bigram:
CatBoostC,Under-Sampling,Word_bigram:
accuracy: 0.7383863080684596
precision: 0.7182721795030548
recall: 0.7383863080684596
f1: 0.6391073421502371
Training Accuracy: 0.35748792270531404
Validation Accuracy: 0.7383863080684596
Word_bigram:
MLPC-sgd,Under-Sampling,Word_bigram:
accuracy: 0.013691931540342298
precision: 0.0001874689893054202
recall: 0.013691931540342298
f1: 0.0003698736933232844
Training Accuracy: 0.3333333333333333
Validation Accuracy: 0.013691931540342298
Word_bigram:
ExtraTreesClassifier,Under-Sampling,Word_bigram:
accuracy: 0.3095354523227384
precision: 0.6727032185641018
recall: 0.3095354523227384
f1: 0.2534120249048235
Training Accuracy: 0.3913043478260869
Validation Accuracy: 0.3095354523227384
Word_bigram:
XGBoost,Over-Sampling,Word_bigram:
accuracy: 0.2396088019559902
precision: 0.05741237797478494
recall: 0.2396088019559902
f1: 0.09262983270882462
Training Accuracy: 0.3449018025289212
Validation Accuracy: 0.2396088019559902
Word_bigram:
CatBoostC,Over-Sampling,Word_bigram:
accuracy: 0.2371638141809291
precision: 0.6181486862841096
recall: 0.2371638141809291
f1: 0.09890916658396938
Training Accuracy: 0.6075329566854991
Validation Accuracy: 0.2371638141809291
Word_bigram:
MLPC-sgd,Over-Sampling,Word_bigram:
accuracy: 0.7383863080684596
precision: 0.6230981692527701
recall: 0.7383863080684596
f1: 0.6440928476812778
Training Accuracy: 0.664030131826742
Validation Accuracy: 0.7383863080684596
Word_bigram:
ExtraTreesClassifier,Over-Sampling,Word_bigram:
accuracy: 0.7897310513447433
precision: 0.7858351199314055
recall: 0.7897310513447433
f1: 0.7393722747828305
Training Accuracy: 0.9837503362927092
Validation Accuracy: 0.7897310513447433
Word_bigram:
XGBoost,SMOTE,Word_bigram:
accuracy: 0.24009779951100244
precision: 0.8041397327259939
recall: 0.24009779951100244
f1: 0.09364374255766049
Training Accuracy: 0.39009954264191543
Validation Accuracy: 0.24009779951100244
Word_bigram:
CatBoostC,SMOTE,Word_bigram:
accuracy: 0.7506112469437652
precision: 0.7995415197371188
recall: 0.7506112469437652
f1: 0.6484315697478267
Training Accuracy: 0.8844767285445251
Validation Accuracy: 0.7506112469437652
Word_bigram:
MLPC-sgd,SMOTE,Word_bigram:
accuracy: 0.7452322738386308
precision: 0.6822035740789999
recall: 0.7452322738386308
f1: 0.6498933479503115
Training Accuracy: 0.7736346516007533
Validation Accuracy: 0.7452322738386308
Word_bigram:
ExtraTreesClassifier,SMOTE,Word_bigram:
accuracy: 0.7887530562347188
precision: 0.7842287144906595
recall: 0.7887530562347188
f1: 0.7387461571157363
Training Accuracy: 0.9511433952111918
Validation Accuracy: 0.7887530562347188
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.744743 0.557462 0.744743
1 CatBoostC Under-Sampling 0.738386 0.718272 0.738386
2 MLPC-sgd Under-Sampling 0.013692 0.000187 0.013692
3 ExtraTreesClassifier Under-Sampling 0.309535 0.672703 0.309535
4 XGBoost Over-Sampling 0.239609 0.057412 0.239609
5 CatBoostC Over-Sampling 0.237164 0.618149 0.237164
6 MLPC-sgd Over-Sampling 0.738386 0.623098 0.738386
7 ExtraTreesClassifier Over-Sampling 0.789731 0.785835 0.789731
8 XGBoost SMOTE 0.240098 0.804140 0.240098
9 CatBoostC SMOTE 0.750611 0.799542 0.750611
10 MLPC-sgd SMOTE 0.745232 0.682204 0.745232
11 ExtraTreesClassifier SMOTE 0.788753 0.784229 0.788753
f1_score training_accuracy validation_accuracy
0 0.637636 0.362319 0.744743
1 0.639107 0.357488 0.738386
2 0.000370 0.333333 0.013692
3 0.253412 0.391304 0.309535
4 0.092630 0.344902 0.239609
5 0.098909 0.607533 0.237164
6 0.644093 0.664030 0.738386
7 0.739372 0.983750 0.789731
8 0.093644 0.390100 0.240098
9 0.648432 0.884477 0.750611
10 0.649893 0.773635 0.745232
11 0.738746 0.951143 0.788753
char bigram-------------------------------------------¶
In [114]:
feature_union = FeatureUnion([
("char_bigram_count", char_bigram_count),
("char_bigram_tfidf", char_bigram_tfidf)
])
In [115]:
feature_unionAndmodel_training("char_bigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["char_bigram"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
char_bigram:
XGBoost,Under-Sampling,char_bigram:
accuracy: 0.6264058679706601
precision: 0.6347017545271905
recall: 0.6264058679706601
f1: 0.6303623506936571
Training Accuracy: 0.3333333333333333
Validation Accuracy: 0.6264058679706601
char_bigram:
CatBoostC,Under-Sampling,char_bigram:
accuracy: 0.5119804400977995
precision: 0.7012931523157435
recall: 0.5119804400977995
f1: 0.5733309891699646
Training Accuracy: 0.45893719806763283
Validation Accuracy: 0.5119804400977995
char_bigram:
MLPC-sgd,Under-Sampling,char_bigram:
accuracy: 0.20244498777506112
precision: 0.6621240364270823
recall: 0.20244498777506112
f1: 0.24674215185157422
Training Accuracy: 0.38647342995169076
Validation Accuracy: 0.20244498777506112
char_bigram:
ExtraTreesClassifier,Under-Sampling,char_bigram:
accuracy: 0.49339853300733494
precision: 0.7144449205645036
recall: 0.49339853300733494
f1: 0.5577858356038657
Training Accuracy: 0.5144927536231885
Validation Accuracy: 0.49339853300733494
char_bigram:
XGBoost,Over-Sampling,char_bigram:
accuracy: 0.2938875305623472
precision: 0.6309468584679232
recall: 0.2938875305623472
f1: 0.22549109881644047
Training Accuracy: 0.33440947000269033
Validation Accuracy: 0.2938875305623472
char_bigram:
CatBoostC,Over-Sampling,char_bigram:
accuracy: 0.6669926650366749
precision: 0.7481551692230886
recall: 0.6669926650366749
f1: 0.6970050768681457
Training Accuracy: 0.7357546408393866
Validation Accuracy: 0.6669926650366749
char_bigram:
MLPC-sgd,Over-Sampling,char_bigram:
accuracy: 0.7726161369193154
precision: 0.780073910877911
recall: 0.7726161369193154
f1: 0.7761152173651298
Training Accuracy: 0.8566585956416466
Validation Accuracy: 0.7726161369193154
char_bigram:
ExtraTreesClassifier,Over-Sampling,char_bigram:
accuracy: 0.7628361858190709
precision: 0.8063134639595191
recall: 0.7628361858190709
f1: 0.6746057809467082
Training Accuracy: 0.9862254506322303
Validation Accuracy: 0.7628361858190709
char_bigram:
XGBoost,SMOTE,char_bigram:
accuracy: 0.2396088019559902
precision: 0.05741237797478494
recall: 0.2396088019559902
f1: 0.09262983270882462
Training Accuracy: 0.36863061608824316
Validation Accuracy: 0.2396088019559902
char_bigram:
CatBoostC,SMOTE,char_bigram:
accuracy: 0.7735941320293398
precision: 0.7528107491638911
recall: 0.7735941320293398
f1: 0.7411515592040571
Training Accuracy: 0.8174334140435836
Validation Accuracy: 0.7735941320293398
char_bigram:
MLPC-sgd,SMOTE,char_bigram:
accuracy: 0.776039119804401
precision: 0.7779083451197722
recall: 0.776039119804401
f1: 0.7768841825426179
Training Accuracy: 0.8709712133440947
Validation Accuracy: 0.776039119804401
char_bigram:
ExtraTreesClassifier,SMOTE,char_bigram:
accuracy: 0.7799511002444988
precision: 0.7956635550575962
recall: 0.7799511002444988
f1: 0.7144830039226291
Training Accuracy: 0.9640570352434757
Validation Accuracy: 0.7799511002444988
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.626406 0.634702 0.626406
1 CatBoostC Under-Sampling 0.511980 0.701293 0.511980
2 MLPC-sgd Under-Sampling 0.202445 0.662124 0.202445
3 ExtraTreesClassifier Under-Sampling 0.493399 0.714445 0.493399
4 XGBoost Over-Sampling 0.293888 0.630947 0.293888
5 CatBoostC Over-Sampling 0.666993 0.748155 0.666993
6 MLPC-sgd Over-Sampling 0.772616 0.780074 0.772616
7 ExtraTreesClassifier Over-Sampling 0.762836 0.806313 0.762836
8 XGBoost SMOTE 0.239609 0.057412 0.239609
9 CatBoostC SMOTE 0.773594 0.752811 0.773594
10 MLPC-sgd SMOTE 0.776039 0.777908 0.776039
11 ExtraTreesClassifier SMOTE 0.779951 0.795664 0.779951
f1_score training_accuracy validation_accuracy
0 0.630362 0.333333 0.626406
1 0.573331 0.458937 0.511980
2 0.246742 0.386473 0.202445
3 0.557786 0.514493 0.493399
4 0.225491 0.334409 0.293888
5 0.697005 0.735755 0.666993
6 0.776115 0.856659 0.772616
7 0.674606 0.986225 0.762836
8 0.092630 0.368631 0.239609
9 0.741152 0.817433 0.773594
10 0.776884 0.870971 0.776039
11 0.714483 0.964057 0.779951
char trigram-------------------------------------------¶
In [117]:
feature_union = FeatureUnion([
("char_trigram_count", char_trigram_count),
("char_trigram_tfidf", char_trigram_tfidf)
])
In [118]:
feature_unionAndmodel_training("char_trigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["char_trigram"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
char_trigram:
XGBoost,Under-Sampling,char_trigram:
accuracy: 0.7437652811735941
precision: 0.5575501150476575
recall: 0.7437652811735941
f1: 0.637334222419797
Training Accuracy: 0.3357487922705314
Validation Accuracy: 0.7437652811735941
char_trigram:
CatBoostC,Under-Sampling,char_trigram:
accuracy: 0.5266503667481662
precision: 0.6784295980459428
recall: 0.5266503667481662
f1: 0.5809084261817494
Training Accuracy: 0.4855072463768116
Validation Accuracy: 0.5266503667481662
char_trigram:
MLPC-sgd,Under-Sampling,char_trigram:
accuracy: 0.5114914425427873
precision: 0.6981192263978314
recall: 0.5114914425427873
f1: 0.56881666496536
Training Accuracy: 0.4106280193236715
Validation Accuracy: 0.5114914425427873
char_trigram:
ExtraTreesClassifier,Under-Sampling,char_trigram:
accuracy: 0.556479217603912
precision: 0.740135399322862
recall: 0.556479217603912
f1: 0.6139464031302234
Training Accuracy: 0.533816425120773
Validation Accuracy: 0.556479217603912
char_trigram:
XGBoost,Over-Sampling,char_trigram:
accuracy: 0.29682151589242056
precision: 0.6583538870591151
recall: 0.29682151589242056
f1: 0.3057447151651328
Training Accuracy: 0.3418348130212537
Validation Accuracy: 0.29682151589242056
char_trigram:
CatBoostC,Over-Sampling,char_trigram:
accuracy: 0.6963325183374083
precision: 0.753973885412959
recall: 0.6963325183374083
f1: 0.7193417392948225
Training Accuracy: 0.7485068603712671
Validation Accuracy: 0.6963325183374083
char_trigram:
MLPC-sgd,Over-Sampling,char_trigram:
accuracy: 0.832762836185819
precision: 0.8285802939423141
recall: 0.832762836185819
f1: 0.8303986865798468
Training Accuracy: 0.9494753833736885
Validation Accuracy: 0.832762836185819
char_trigram:
ExtraTreesClassifier,Over-Sampling,char_trigram:
accuracy: 0.8024449877750611
precision: 0.8251092212835346
recall: 0.8024449877750611
f1: 0.7510225224076662
Training Accuracy: 0.9863868711326339
Validation Accuracy: 0.8024449877750611
char_trigram:
XGBoost,SMOTE,char_trigram:
accuracy: 0.2396088019559902
precision: 0.05741237797478494
recall: 0.2396088019559902
f1: 0.09262983270882462
Training Accuracy: 0.39892386333064306
Validation Accuracy: 0.2396088019559902
char_trigram:
CatBoostC,SMOTE,char_trigram:
accuracy: 0.767237163814181
precision: 0.7453820528076922
recall: 0.767237163814181
f1: 0.749831730620505
Training Accuracy: 0.8355663169222493
Validation Accuracy: 0.767237163814181
char_trigram:
MLPC-sgd,SMOTE,char_trigram:
accuracy: 0.8352078239608802
precision: 0.8331876566028675
recall: 0.8352078239608802
f1: 0.8341407203725756
Training Accuracy: 0.9404896421845574
Validation Accuracy: 0.8352078239608802
char_trigram:
ExtraTreesClassifier,SMOTE,char_trigram:
accuracy: 0.79559902200489
precision: 0.8101592616492123
recall: 0.79559902200489
f1: 0.7426270168688798
Training Accuracy: 0.9597524885660479
Validation Accuracy: 0.79559902200489
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.743765 0.557550 0.743765
1 CatBoostC Under-Sampling 0.526650 0.678430 0.526650
2 MLPC-sgd Under-Sampling 0.511491 0.698119 0.511491
3 ExtraTreesClassifier Under-Sampling 0.556479 0.740135 0.556479
4 XGBoost Over-Sampling 0.296822 0.658354 0.296822
5 CatBoostC Over-Sampling 0.696333 0.753974 0.696333
6 MLPC-sgd Over-Sampling 0.832763 0.828580 0.832763
7 ExtraTreesClassifier Over-Sampling 0.802445 0.825109 0.802445
8 XGBoost SMOTE 0.239609 0.057412 0.239609
9 CatBoostC SMOTE 0.767237 0.745382 0.767237
10 MLPC-sgd SMOTE 0.835208 0.833188 0.835208
11 ExtraTreesClassifier SMOTE 0.795599 0.810159 0.795599
f1_score training_accuracy validation_accuracy
0 0.637334 0.335749 0.743765
1 0.580908 0.485507 0.526650
2 0.568817 0.410628 0.511491
3 0.613946 0.533816 0.556479
4 0.305745 0.341835 0.296822
5 0.719342 0.748507 0.696333
6 0.830399 0.949475 0.832763
7 0.751023 0.986387 0.802445
8 0.092630 0.398924 0.239609
9 0.749832 0.835566 0.767237
10 0.834141 0.940490 0.835208
11 0.742627 0.959752 0.795599
char bigram + char trigram-------------------------------------------¶
In [120]:
feature_union = FeatureUnion([
("word_bigram_count", word_bigram_count),
("word_bigram_tfidf", word_bigram_tfidf),
("char_trigram_count", char_trigram_count),
("char_trigram_tfidf", char_trigram_tfidf)
])
In [121]:
feature_unionAndmodel_training("word_tri+char_tri",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["word_tri+char_tri"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
word_tri+char_tri:
XGBoost,Under-Sampling,word_tri+char_tri:
accuracy: 0.4102689486552567
precision: 0.6326681653246251
recall: 0.4102689486552567
f1: 0.4280388951475887
Training Accuracy: 0.3695652173913044
Validation Accuracy: 0.4102689486552567
word_tri+char_tri:
CatBoostC,Under-Sampling,word_tri+char_tri:
accuracy: 0.5545232273838631
precision: 0.6947910642326474
recall: 0.5545232273838631
f1: 0.605369518144594
Training Accuracy: 0.4879227053140096
Validation Accuracy: 0.5545232273838631
word_tri+char_tri:
MLPC-sgd,Under-Sampling,word_tri+char_tri:
accuracy: 0.5300733496332518
precision: 0.7261427094807171
recall: 0.5300733496332518
f1: 0.6065311265402523
Training Accuracy: 0.4806763285024154
Validation Accuracy: 0.5300733496332518
word_tri+char_tri:
ExtraTreesClassifier,Under-Sampling,word_tri+char_tri:
accuracy: 0.5491442542787286
precision: 0.747378496008514
recall: 0.5491442542787286
f1: 0.6112365787596845
Training Accuracy: 0.5386473429951691
Validation Accuracy: 0.5491442542787286
word_tri+char_tri:
XGBoost,Over-Sampling,word_tri+char_tri:
accuracy: 0.022004889975550123
precision: 0.8321424024011673
recall: 0.022004889975550123
f1: 0.019620963780231933
Training Accuracy: 0.35275760021522734
Validation Accuracy: 0.022004889975550123
word_tri+char_tri:
CatBoostC,Over-Sampling,word_tri+char_tri:
accuracy: 0.7085574572127139
precision: 0.7643728054902549
recall: 0.7085574572127139
f1: 0.7307394895252735
Training Accuracy: 0.749313962873285
Validation Accuracy: 0.7085574572127139
word_tri+char_tri:
MLPC-sgd,Over-Sampling,word_tri+char_tri:
accuracy: 0.8366748166259169
precision: 0.8320275601711585
recall: 0.8366748166259169
f1: 0.8340288412594014
Training Accuracy: 0.9533494753833738
Validation Accuracy: 0.8366748166259169
word_tri+char_tri:
ExtraTreesClassifier,Over-Sampling,word_tri+char_tri:
accuracy: 0.789242053789731
precision: 0.8176749955061983
recall: 0.789242053789731
f1: 0.7284874608985384
Training Accuracy: 0.9867635189669088
Validation Accuracy: 0.789242053789731
word_tri+char_tri:
XGBoost,SMOTE,word_tri+char_tri:
accuracy: 0.23667481662591688
precision: 0.05824082521378438
recall: 0.23667481662591688
f1: 0.09326248302091822
Training Accuracy: 0.47925746569814365
Validation Accuracy: 0.23667481662591688
word_tri+char_tri:
CatBoostC,SMOTE,word_tri+char_tri:
accuracy: 0.7750611246943765
precision: 0.7501122442982812
recall: 0.7750611246943765
f1: 0.7504386915824807
Training Accuracy: 0.8392789884315307
Validation Accuracy: 0.7750611246943765
word_tri+char_tri:
MLPC-sgd,SMOTE,word_tri+char_tri:
accuracy: 0.8366748166259169
precision: 0.830694918498291
recall: 0.8366748166259169
f1: 0.833306726716856
Training Accuracy: 0.9483454398708636
Validation Accuracy: 0.8366748166259169
word_tri+char_tri:
ExtraTreesClassifier,SMOTE,word_tri+char_tri:
accuracy: 0.7867970660146699
precision: 0.8155873610708242
recall: 0.7867970660146699
f1: 0.7238500112549312
Training Accuracy: 0.9601291364003228
Validation Accuracy: 0.7867970660146699
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.410269 0.632668 0.410269
1 CatBoostC Under-Sampling 0.554523 0.694791 0.554523
2 MLPC-sgd Under-Sampling 0.530073 0.726143 0.530073
3 ExtraTreesClassifier Under-Sampling 0.549144 0.747378 0.549144
4 XGBoost Over-Sampling 0.022005 0.832142 0.022005
5 CatBoostC Over-Sampling 0.708557 0.764373 0.708557
6 MLPC-sgd Over-Sampling 0.836675 0.832028 0.836675
7 ExtraTreesClassifier Over-Sampling 0.789242 0.817675 0.789242
8 XGBoost SMOTE 0.236675 0.058241 0.236675
9 CatBoostC SMOTE 0.775061 0.750112 0.775061
10 MLPC-sgd SMOTE 0.836675 0.830695 0.836675
11 ExtraTreesClassifier SMOTE 0.786797 0.815587 0.786797
f1_score training_accuracy validation_accuracy
0 0.428039 0.369565 0.410269
1 0.605370 0.487923 0.554523
2 0.606531 0.480676 0.530073
3 0.611237 0.538647 0.549144
4 0.019621 0.352758 0.022005
5 0.730739 0.749314 0.708557
6 0.834029 0.953349 0.836675
7 0.728487 0.986764 0.789242
8 0.093262 0.479257 0.236675
9 0.750439 0.839279 0.775061
10 0.833307 0.948345 0.836675
11 0.723850 0.960129 0.786797
char unigram + char trigram-------------------------------------------¶
In [123]:
feature_union = FeatureUnion([
("word_unigram_count", word_unigram_count),
("word_unigram_tfidf", word_unigram_tfidf),
("char_trigram_count", char_trigram_count),
("char_trigram_tfidf", char_trigram_tfidf)
])
In [124]:
feature_unionAndmodel_training("char_uni+char_tri",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["char_uni+char_tri"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
char_uni+char_tri:
XGBoost,Under-Sampling,char_uni+char_tri:
accuracy: 0.6400977995110024
precision: 0.5871215502758061
recall: 0.6400977995110024
f1: 0.610624168789445
Training Accuracy: 0.37198067632850246
Validation Accuracy: 0.6400977995110024
char_uni+char_tri:
CatBoostC,Under-Sampling,char_uni+char_tri:
accuracy: 0.5202933985330074
precision: 0.6954071506598201
recall: 0.5202933985330074
f1: 0.5789420190665412
Training Accuracy: 0.4879227053140096
Validation Accuracy: 0.5202933985330074
char_uni+char_tri:
MLPC-sgd,Under-Sampling,char_uni+char_tri:
accuracy: 0.4356968215158924
precision: 0.6739252814384025
recall: 0.4356968215158924
f1: 0.5167196176640665
Training Accuracy: 0.4057971014492754
Validation Accuracy: 0.4356968215158924
char_uni+char_tri:
ExtraTreesClassifier,Under-Sampling,char_uni+char_tri:
accuracy: 0.5677261613691932
precision: 0.7466939546285011
recall: 0.5677261613691932
f1: 0.6234122817232957
Training Accuracy: 0.5652173913043478
Validation Accuracy: 0.5677261613691932
char_uni+char_tri:
XGBoost,Over-Sampling,char_uni+char_tri:
accuracy: 0.019559902200488997
precision: 0.7837063693019813
recall: 0.019559902200488997
f1: 0.014767866963510711
Training Accuracy: 0.3403282216841539
Validation Accuracy: 0.019559902200488997
char_uni+char_tri:
CatBoostC,Over-Sampling,char_uni+char_tri:
accuracy: 0.6992665036674817
precision: 0.7588717427719135
recall: 0.6992665036674817
f1: 0.7222379983523248
Training Accuracy: 0.7532956685499057
Validation Accuracy: 0.6992665036674817
char_uni+char_tri:
MLPC-sgd,Over-Sampling,char_uni+char_tri:
accuracy: 0.8283618581907091
precision: 0.8202322560693016
recall: 0.8283618581907091
f1: 0.8238097105427027
Training Accuracy: 0.9495291902071563
Validation Accuracy: 0.8283618581907091
char_uni+char_tri:
ExtraTreesClassifier,Over-Sampling,char_uni+char_tri:
accuracy: 0.7975550122249389
precision: 0.8130472030199646
recall: 0.7975550122249389
f1: 0.7445224962018224
Training Accuracy: 0.9868173258003766
Validation Accuracy: 0.7975550122249389
char_uni+char_tri:
XGBoost,SMOTE,char_uni+char_tri:
accuracy: 0.23374083129584353
precision: 0.05819951265725905
recall: 0.23374083129584353
f1: 0.09318732497031335
Training Accuracy: 0.43938660209846647
Validation Accuracy: 0.23374083129584353
char_uni+char_tri:
CatBoostC,SMOTE,char_uni+char_tri:
accuracy: 0.7726161369193154
precision: 0.7484608430347525
recall: 0.7726161369193154
f1: 0.7488497970840255
Training Accuracy: 0.8409470002690341
Validation Accuracy: 0.7726161369193154
char_uni+char_tri:
MLPC-sgd,SMOTE,char_uni+char_tri:
accuracy: 0.8273838630806846
precision: 0.8209420578880833
recall: 0.8273838630806846
f1: 0.8237890028579609
Training Accuracy: 0.940973903685768
Validation Accuracy: 0.8273838630806846
char_uni+char_tri:
ExtraTreesClassifier,SMOTE,char_uni+char_tri:
accuracy: 0.7970660146699267
precision: 0.8126595086967541
recall: 0.7970660146699267
f1: 0.7440900031427411
Training Accuracy: 0.9607210115684692
Validation Accuracy: 0.7970660146699267
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.640098 0.587122 0.640098
1 CatBoostC Under-Sampling 0.520293 0.695407 0.520293
2 MLPC-sgd Under-Sampling 0.435697 0.673925 0.435697
3 ExtraTreesClassifier Under-Sampling 0.567726 0.746694 0.567726
4 XGBoost Over-Sampling 0.019560 0.783706 0.019560
5 CatBoostC Over-Sampling 0.699267 0.758872 0.699267
6 MLPC-sgd Over-Sampling 0.828362 0.820232 0.828362
7 ExtraTreesClassifier Over-Sampling 0.797555 0.813047 0.797555
8 XGBoost SMOTE 0.233741 0.058200 0.233741
9 CatBoostC SMOTE 0.772616 0.748461 0.772616
10 MLPC-sgd SMOTE 0.827384 0.820942 0.827384
11 ExtraTreesClassifier SMOTE 0.797066 0.812660 0.797066
f1_score training_accuracy validation_accuracy
0 0.610624 0.371981 0.640098
1 0.578942 0.487923 0.520293
2 0.516720 0.405797 0.435697
3 0.623412 0.565217 0.567726
4 0.014768 0.340328 0.019560
5 0.722238 0.753296 0.699267
6 0.823810 0.949529 0.828362
7 0.744522 0.986817 0.797555
8 0.093187 0.439387 0.233741
9 0.748850 0.840947 0.772616
10 0.823789 0.940974 0.827384
11 0.744090 0.960721 0.797066
word unigram + word bigram + char trigram -------------------------------------------¶
In [126]:
feature_union = FeatureUnion([
("word_unigram_count", word_unigram_count),
("word_unigram_tfidf", word_unigram_tfidf),
("char_bigram_count", char_bigram_count),
("char_bigram_tfidf", char_bigram_tfidf),
("char_trigram_count", char_trigram_count),
("char_trigram_tfidf", char_trigram_tfidf)
])
In [127]:
feature_unionAndmodel_training("word_uni+word_bi+char_tri",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["word_uni+word_bi+char_tri"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
word_uni+word_bi+char_tri:
XGBoost,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.2396088019559902
precision: 0.6010213048009339
recall: 0.2396088019559902
f1: 0.10066342121438328
Training Accuracy: 0.357487922705314
Validation Accuracy: 0.2396088019559902
word_uni+word_bi+char_tri:
CatBoostC,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.5246943765281173
precision: 0.697460324720474
recall: 0.5246943765281173
f1: 0.5850971267318494
Training Accuracy: 0.4613526570048309
Validation Accuracy: 0.5246943765281173
word_uni+word_bi+char_tri:
MLPC-sgd,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.4924205378973105
precision: 0.6964274470421332
recall: 0.4924205378973105
f1: 0.5582339864481477
Training Accuracy: 0.5048309178743962
Validation Accuracy: 0.4924205378973105
word_uni+word_bi+char_tri:
ExtraTreesClassifier,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.5691931540342299
precision: 0.756086457759344
recall: 0.5691931540342299
f1: 0.6261420613838502
Training Accuracy: 0.5434782608695653
Validation Accuracy: 0.5691931540342299
word_uni+word_bi+char_tri:
XGBoost,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.24449877750611246
precision: 0.5457468467360217
recall: 0.24449877750611246
f1: 0.10885840167008869
Training Accuracy: 0.36206618240516547
Validation Accuracy: 0.24449877750611246
word_uni+word_bi+char_tri:
CatBoostC,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.7178484107579463
precision: 0.7683158388263848
recall: 0.7178484107579463
f1: 0.7378813887877547
Training Accuracy: 0.7563626580575734
Validation Accuracy: 0.7178484107579463
word_uni+word_bi+char_tri:
MLPC-sgd,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.8381418092909535
precision: 0.8389920793674207
recall: 0.8381418092909535
f1: 0.8384964609802923
Training Accuracy: 0.9513586225450631
Validation Accuracy: 0.8381418092909535
word_uni+word_bi+char_tri:
ExtraTreesClassifier,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.7882640586797066
precision: 0.8143835519307563
recall: 0.7882640586797066
f1: 0.7261293406259197
Training Accuracy: 0.9862792574656981
Validation Accuracy: 0.7882640586797066
word_uni+word_bi+char_tri:
XGBoost,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.35794621026894863
precision: 0.6624535910948148
recall: 0.35794621026894863
f1: 0.361458297753706
Training Accuracy: 0.37928436911487756
Validation Accuracy: 0.35794621026894863
word_uni+word_bi+char_tri:
CatBoostC,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.7877750611246944
precision: 0.7735558230366292
recall: 0.7877750611246944
f1: 0.7649803870163941
Training Accuracy: 0.8362658057573312
Validation Accuracy: 0.7877750611246944
word_uni+word_bi+char_tri:
MLPC-sgd,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.8381418092909535
precision: 0.8394583973927293
recall: 0.8381418092909535
f1: 0.8387471869942711
Training Accuracy: 0.9442561205273069
Validation Accuracy: 0.8381418092909535
word_uni+word_bi+char_tri:
ExtraTreesClassifier,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.7951100244498778
precision: 0.8159438172725928
recall: 0.7951100244498778
f1: 0.739746258991426
Training Accuracy: 0.9653483992467043
Validation Accuracy: 0.7951100244498778
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.239609 0.601021 0.239609
1 CatBoostC Under-Sampling 0.524694 0.697460 0.524694
2 MLPC-sgd Under-Sampling 0.492421 0.696427 0.492421
3 ExtraTreesClassifier Under-Sampling 0.569193 0.756086 0.569193
4 XGBoost Over-Sampling 0.244499 0.545747 0.244499
5 CatBoostC Over-Sampling 0.717848 0.768316 0.717848
6 MLPC-sgd Over-Sampling 0.838142 0.838992 0.838142
7 ExtraTreesClassifier Over-Sampling 0.788264 0.814384 0.788264
8 XGBoost SMOTE 0.357946 0.662454 0.357946
9 CatBoostC SMOTE 0.787775 0.773556 0.787775
10 MLPC-sgd SMOTE 0.838142 0.839458 0.838142
11 ExtraTreesClassifier SMOTE 0.795110 0.815944 0.795110
f1_score training_accuracy validation_accuracy
0 0.100663 0.357488 0.239609
1 0.585097 0.461353 0.524694
2 0.558234 0.504831 0.492421
3 0.626142 0.543478 0.569193
4 0.108858 0.362066 0.244499
5 0.737881 0.756363 0.717848
6 0.838496 0.951359 0.838142
7 0.726129 0.986279 0.788264
8 0.361458 0.379284 0.357946
9 0.764980 0.836266 0.787775
10 0.838747 0.944256 0.838142
11 0.739746 0.965348 0.795110
word unigram + word bigram + char bigram + char trigram -------------------------------------------¶
In [129]:
feature_union = FeatureUnion([
("word_unigram_count", word_unigram_count),
("word_unigram_tfidf", word_unigram_tfidf),
("word_bigram_count", word_bigram_count),
("word_bigram_tfidf", word_bigram_tfidf),
("char_bigram_count", char_bigram_count),
("char_bigram_tfidf", char_bigram_tfidf),
("char_trigram_count", char_trigram_count),
("char_trigram_tfidf", char_trigram_tfidf)
])
In [130]:
results_df = feature_unionAndmodel_training("word_uni+word_bi+char_bi+char_tri",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["word_uni+word_bi+char_bi+char_tri"] = results_df
etiket 0 6195 1 1846 2 138 Name: count, dtype: int64
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0 138
1 138
2 138
Name: count, dtype: int64
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1 6195
0 6195
2 6195
Name: count, dtype: int64
word_uni+word_bi+char_bi+char_tri:
XGBoost,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.013691931540342298
precision: 0.0001874689893054202
recall: 0.013691931540342298
f1: 0.0003698736933232844
Training Accuracy: 0.32850241545893716
Validation Accuracy: 0.013691931540342298
word_uni+word_bi+char_bi+char_tri:
CatBoostC,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.49779951100244496
precision: 0.69280455043867
recall: 0.49779951100244496
f1: 0.5567203737246547
Training Accuracy: 0.43719806763285024
Validation Accuracy: 0.49779951100244496
word_uni+word_bi+char_bi+char_tri:
MLPC-sgd,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.4963325183374083
precision: 0.6773985151234119
recall: 0.4963325183374083
f1: 0.5658100095215349
Training Accuracy: 0.4830917874396135
Validation Accuracy: 0.4963325183374083
word_uni+word_bi+char_bi+char_tri:
ExtraTreesClassifier,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.6019559902200489
precision: 0.7526797826663255
recall: 0.6019559902200489
f1: 0.6533982546175323
Training Accuracy: 0.5603864734299517
Validation Accuracy: 0.6019559902200489
word_uni+word_bi+char_bi+char_tri:
XGBoost,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.07383863080684597
precision: 0.5590891862039353
recall: 0.07383863080684597
f1: 0.11046800085182826
Training Accuracy: 0.35598601022329834
Validation Accuracy: 0.07383863080684597
word_uni+word_bi+char_bi+char_tri:
CatBoostC,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.7012224938875306
precision: 0.7619246811117365
recall: 0.7012224938875306
f1: 0.7249652562645343
Training Accuracy: 0.7583535108958838
Validation Accuracy: 0.7012224938875306
word_uni+word_bi+char_bi+char_tri:
MLPC-sgd,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.8420537897310514
precision: 0.837977812944816
recall: 0.8420537897310514
f1: 0.8398772181999302
Training Accuracy: 0.9549098735539413
Validation Accuracy: 0.8420537897310514
word_uni+word_bi+char_bi+char_tri:
ExtraTreesClassifier,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.7833740831295843
precision: 0.8107393367212573
recall: 0.7833740831295843
f1: 0.7177808334228963
Training Accuracy: 0.9866020984665053
Validation Accuracy: 0.7833740831295843
word_uni+word_bi+char_bi+char_tri:
XGBoost,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.20097799511002445
precision: 0.5930430834219854
recall: 0.20097799511002445
f1: 0.23997680954587133
Training Accuracy: 0.34210384718859294
Validation Accuracy: 0.20097799511002445
word_uni+word_bi+char_bi+char_tri:
CatBoostC,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.78239608801956
precision: 0.7639352682392094
recall: 0.78239608801956
f1: 0.7613290224414989
Training Accuracy: 0.8441216034436373
Validation Accuracy: 0.78239608801956
word_uni+word_bi+char_bi+char_tri:
MLPC-sgd,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.8469437652811735
precision: 0.8437840595842998
recall: 0.8469437652811735
f1: 0.8452462622973984
Training Accuracy: 0.9498520312079636
Validation Accuracy: 0.8469437652811735
word_uni+word_bi+char_bi+char_tri:
ExtraTreesClassifier,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.7907090464547677
precision: 0.8066973414488265
recall: 0.7907090464547677
f1: 0.733048315750241
Training Accuracy: 0.9628732849071832
Validation Accuracy: 0.7907090464547677
model sampling accuracy precision recall \
0 XGBoost Under-Sampling 0.013692 0.000187 0.013692
1 CatBoostC Under-Sampling 0.497800 0.692805 0.497800
2 MLPC-sgd Under-Sampling 0.496333 0.677399 0.496333
3 ExtraTreesClassifier Under-Sampling 0.601956 0.752680 0.601956
4 XGBoost Over-Sampling 0.073839 0.559089 0.073839
5 CatBoostC Over-Sampling 0.701222 0.761925 0.701222
6 MLPC-sgd Over-Sampling 0.842054 0.837978 0.842054
7 ExtraTreesClassifier Over-Sampling 0.783374 0.810739 0.783374
8 XGBoost SMOTE 0.200978 0.593043 0.200978
9 CatBoostC SMOTE 0.782396 0.763935 0.782396
10 MLPC-sgd SMOTE 0.846944 0.843784 0.846944
11 ExtraTreesClassifier SMOTE 0.790709 0.806697 0.790709
f1_score training_accuracy validation_accuracy
0 0.000370 0.328502 0.013692
1 0.556720 0.437198 0.497800
2 0.565810 0.483092 0.496333
3 0.653398 0.560386 0.601956
4 0.110468 0.355986 0.073839
5 0.724965 0.758354 0.701222
6 0.839877 0.954910 0.842054
7 0.717781 0.986602 0.783374
8 0.239977 0.342104 0.200978
9 0.761329 0.844122 0.782396
10 0.845246 0.949852 0.846944
11 0.733048 0.962873 0.790709
😊😊conclusion😊😊¶
In [132]:
def dictionary_to_dataframe(result_dic):
"""
Converts a dictionary into a DataFrame and adds the gram_name as a column.
Parameters:
result_dic (dict): Dictionary containing results with gram_name as keys.
Returns:
pd.DataFrame: Combined DataFrame with gram_name as a column.
"""
dataframes = []
for gram_name, metrics in result_dic.items():
temp_df = pd.DataFrame(metrics)
temp_df.insert(0, 'gram_name', gram_name)
dataframes.append(temp_df)
return pd.concat(dataframes, ignore_index=True)
def visualize_metrics(result_dic):
"""
Visualizes the metrics from a dictionary using grouped bar charts.
Parameters:
result_dic (dict): Dictionary containing results with gram_name as keys.
Returns:
None: Displays the visualization.
"""
# Convert dictionary to DataFrame
final_df = dictionary_to_dataframe(result_dic)
# Group by gram_name and calculate mean scores
grouped_df = final_df.groupby(['gram_name', 'sampling']).mean(numeric_only=True).reset_index()
# Metrics to plot
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'training_accuracy', 'validation_accuracy']
# Create figure and axes
fig, ax = plt.subplots(figsize=(14, 8))
# Number of gram names and metrics
gram_sampling_labels = grouped_df[['gram_name', 'sampling']].apply(lambda x: f"{x['gram_name']}\n({x['sampling']})", axis=1)
x = np.arange(len(gram_sampling_labels)) # The label locations
width = 0.12 # Adjusted width for more metrics
# Plot each metric as a separate set of bars
for i, metric in enumerate(metrics):
ax.bar(x + i * width, grouped_df[metric], width, label=metric.capitalize())
# Add labels, title, and legend
ax.set_xlabel('Gram Name and Sampling Method')
ax.set_ylabel('Score')
ax.set_title('Mean Scores by Gram Name, Sampling Method, and Metric')
ax.set_xticks(x + width * len(metrics) / 2)
ax.set_xticklabels(gram_sampling_labels, rotation=45, ha='right')
ax.legend()
# Add spacing between groups
plt.tight_layout()
plt.show()
In [133]:
visualize_metrics(result_dic)